package au.com.acpfg.io.genbank.reader;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Holds the state parsed from one record of a GenBank-format flat file.
 * <p>
 * The constructor scans the record text once: the LOCUS header line is split into
 * its fields and every subsequent upper-case section tag that this class knows
 * about (DEFINITION, SOURCE, ORIGIN, COMMENT, ACCESSION, VERSION, FEATURES) is
 * stored in a member variable or, for FEATURES, forwarded to a listener.
 * <p>
 * Instances are mutable (public setters, no synchronisation), so they are not
 * safe to share between threads without external locking.
 *
 * @author andrew.cassin
 *
 */
public class GenbankRecord {
    // locus header fields
    private String m_locus_name;
    private String m_locus_length;
    private String m_molecule_type;
    private String m_last_modified;
    private String m_category;

    // section bodies, stored as found in the record
    private String m_definition;
    private String m_taxonomy; // and lineage
    private String m_sequence;
    private String m_comment;
    private String m_accession; // from ACCESSION section
    private String m_version;

    /** Finds a section tag: a run of upper-case letters at the start of a line, followed by whitespace. */
    private final static Pattern splitter = Pattern.compile("[\\r\\n]([A-Z]+)\\s", Pattern.MULTILINE);

    /** Maximum number of characters of the record quoted in the "Cannot match locus" error message. */
    private final static int MAX_ERROR_SNIPPET = 80;

    /**
     * Parses the record (in multi-line genbank format) into the member variables.
     * Code here must be fast rather than pretty since genbank is pretty large ;-)
     * The caller is expected to supply a record whose first line begins with 'LOCUS'
     * and whose terminator (//) has been removed; no other assumptions are made.
     *
     * @param rec_sb the text of a single record
     * @param l      receives one callback per sub-section of the FEATURES section;
     *               may be <code>null</code>, in which case FEATURES is skipped
     * @throws InvalidGenbankRecordException if the first line has fewer than seven
     *               whitespace-separated tokens, or if the listener throws it
     */
    public GenbankRecord(StringBuffer rec_sb, GenbankFeatureListener l) throws InvalidGenbankRecordException {
        String rec = rec_sb.toString();

        // A record consisting of a single line has no '\n': treat the whole text as the LOCUS line
        int eol = rec.indexOf('\n');
        String locus_line = (eol >= 0) ? rec.substring(0, eol) : rec;
        String[] locus_tok = locus_line.split("\\s+");
        if (locus_tok.length < 7) {
            // clamp the snippet so that short records still produce the intended exception
            String snippet = rec.substring(0, Math.min(MAX_ERROR_SNIPPET, rec.length()));
            throw new InvalidGenbankRecordException("Cannot match locus: <" + snippet + ">");
        }

        set_locus_name(locus_tok[1]);
        if (locus_tok[3].equals("aa") || locus_tok[3].equals("bp")) {
            // keep the unit attached to the number, eg. "1234bp"
            set_locus_length(locus_tok[2] + locus_tok[3]);
        } else {
            set_locus_length(locus_tok[2]);
        }
        // NOTE(review): tokens 4..6 are taken positionally; header lines with fewer
        // columns will shift these values - confirm against the records actually processed.
        set_molecule_type(locus_tok[4] + ' ' + locus_tok[5]);
        set_division(locus_tok[6]);
        set_last_modified(locus_tok[locus_tok.length - 1]);

        // Single pass over every tag-like token; sections are handled in file order,
        // so whatever ACCESSION has been seen so far is what the feature listener is given.
        Matcher m = splitter.matcher(rec);
        while (m.find()) {
            String tag = m.group(1);
            int tag_start = m.start(1);
            int end_of_tag = m.end(1);

            if (tag.equals("DEFINITION")) {
                set_definition(section_text(rec, tag_start, end_of_tag));
            } else if (tag.equals("SOURCE")) {
                set_taxonomy(section_text(rec, tag_start, end_of_tag));
            } else if (tag.equals("ORIGIN")) {
                set_sequence(section_text(rec, tag_start, end_of_tag));
            } else if (tag.equals("COMMENT")) {
                set_comment(section_text(rec, tag_start, end_of_tag));
            } else if (tag.equals("ACCESSION")) {
                set_accession(section_text(rec, tag_start, end_of_tag).trim());
            } else if (tag.equals("VERSION")) {
                set_version(section_text(rec, tag_start, end_of_tag).trim());
            } else if (tag.equals("FEATURES")) {
                if (l != null) {
                    process_features(rec, end_of_tag, find_end_tag(rec, tag_start), l);
                }
            } else {
                // BUG: unrecognised tags are silently ignored for now...
            }
        }
    }

    /**
     * Returns the body of the section whose tag occupies <code>[tag_start, end_of_tag)</code>,
     * ie. everything after the tag up to (not including) the line break that precedes the next section.
     * The end offset computed by <code>find_end_tag</code> is never before <code>end_of_tag</code>
     * because the tag itself contains no line break, so the substring is always valid.
     *
     * @param rec        the complete record
     * @param tag_start  offset of the first character of the tag
     * @param end_of_tag offset just past the last character of the tag
     * @return the (untrimmed) section body, possibly empty
     */
    private String section_text(String rec, int tag_start, int end_of_tag) {
        return rec.substring(end_of_tag, find_end_tag(rec, tag_start));
    }

    /**
     * Need to be fast here
     * This method invokes the listener once for each sub-section eg. source/CDS/...
     * A sub-section heading is recognised as a line break followed by fewer than ten
     * whitespace characters in total (the line break itself is counted) and then a run of letters.
     * Does nothing if the section is empty or contains no such heading.
     *
     * @param rec          the complete record
     * @param end_of_tag   offset just past the FEATURES tag
     * @param find_end_tag offset at which the FEATURES section ends
     * @param l            listener to notify, must not be <code>null</code>
     * @throws InvalidGenbankRecordException if the listener rejects a sub-section
     */
    private void process_features(String rec, int end_of_tag, int find_end_tag, GenbankFeatureListener l) throws InvalidGenbankRecordException {
        if (find_end_tag <= end_of_tag) {
            return; // FEATURES immediately followed by the next tag: nothing to report
        }
        String feature_section = rec.substring(end_of_tag, find_end_tag);
        final int feature_len = feature_section.length();

        // the headings are kept as a forward-only linked list
        SectionStart first = null;
        SectionStart last = null;

        int offset = 0;
        while ((offset = feature_section.indexOf('\n', offset)) >= 0) {
            // skip the line break and any indentation which follows it
            int line_break = offset;
            while (offset < feature_len && Character.isWhitespace(feature_section.charAt(offset))) {
                offset++;
            }
            int n_spaces = offset - line_break;

            if (n_spaces > 0 && n_spaces < 10) { // probably a start of a sub-section
                // NOTE(review): only letters are accepted, so a key containing '_', digits
                // or quotes is cut at the first such character - confirm this is what
                // the listeners expect before widening it.
                int key_start = offset;
                while (offset < feature_len && Character.isLetter(feature_section.charAt(offset))) {
                    offset++;
                }
                // explicit locale: keys must not depend on the JVM's default locale
                String key = feature_section.substring(key_start, offset).toLowerCase(java.util.Locale.ENGLISH);

                SectionStart ss = new SectionStart(key, offset, -1, null);
                if (first == null) {
                    first = ss;
                }
                if (last != null) {
                    // the previous sub-section runs up to the first character of this key
                    last.set_len(key_start - last.get_start());
                    last.set_next(ss);
                }
                last = ss;
            }
            offset++;
        }

        if (last == null) {
            return; // no sub-section heading found
        }
        // the final sub-section extends to the end of the FEATURES section
        last.set_len(feature_len - last.get_start());

        // now invoke the listener with the results of the parse
        for (SectionStart ss = first; ss != null; ss = ss.get_next()) {
            int start = ss.get_start();
            l.parse_section(ss.get_title(), get_accession(), feature_section.substring(start, start + ss.get_len()));
        }
    }

    /**
     * Quickly computes the character offset in <code>rec</code> at which the section
     * containing <code>start</code> ends: either the line break immediately preceding the
     * next line that begins with an upper-case letter, or the end of the record, whichever comes first.
     *
     * @param rec   the complete record
     * @param start offset inside the current section from which to search
     * @return offset of the terminating line break, or <code>rec.length()</code> if no further section follows
     */
    private int find_end_tag(String rec, int start) {
        // a section might look like this:
        // DESCRIPTION .... blah blah blah...
        //             more comments here
        // NEXT TAG
        // we must compute the position of NEXT TAG or the end of the record
        final int len = rec.length();
        int end = start;
        while (end < len) {
            end = rec.indexOf('\n', end);
            if (end < 0 || end + 1 >= len) {
                return len; // end of record is the end of the section
            }
            char c = rec.charAt(end + 1);
            if (Character.isLetter(c) && Character.isUpperCase(c)) { // new section starting?
                return end;
            }
            end++; // continuation line: keep looking after this line break
        }
        return len;
    }

    /** Sets the locus name (second token of the LOCUS line). */
    public void set_locus_name(String m_locus_name) {
        this.m_locus_name = m_locus_name;
    }

    /** @return the locus name */
    public String get_locus_name() {
        return m_locus_name;
    }

    /** Sets the locus length, eg. "1234bp". */
    public void set_locus_length(String m_locus_length) {
        this.m_locus_length = m_locus_length;
    }

    /** @return the locus length as text, with the unit appended when the header supplied "aa" or "bp" */
    public String get_locus_length() {
        return m_locus_length;
    }

    /** Sets the molecule type. */
    public void set_molecule_type(String m_molecule_type) {
        this.m_molecule_type = m_molecule_type;
    }

    /** @return the fifth and sixth tokens of the LOCUS line joined by a single space */
    public String get_molecule_type() {
        return m_molecule_type;
    }

    /** Sets the last-modified value. */
    public void set_last_modified(String m_last_modified) {
        this.m_last_modified = m_last_modified;
    }

    /** @return the final token of the LOCUS line */
    public String get_last_modified() {
        return m_last_modified;
    }

    /** Sets the division. */
    public void set_division(String m_category) {
        this.m_category = m_category;
    }

    /** @return the seventh token of the LOCUS line */
    public String get_division() {
        return m_category;
    }

    /** Sets the text of the DEFINITION section. */
    public void set_definition(String m_definition) {
        this.m_definition = m_definition;
    }

    /** @return the untrimmed DEFINITION text, or null if the record had none */
    public String get_definition() {
        return m_definition;
    }

    /** Sets the text of the SOURCE section. */
    public void set_taxonomy(String m_taxonomy) {
        this.m_taxonomy = m_taxonomy;
    }

    /** @return the untrimmed SOURCE text, or null if the record had none */
    public String get_taxonomy() {
        return m_taxonomy;
    }

    /** Sets the raw text of the ORIGIN section. */
    public void set_sequence(String m_sequence) {
        this.m_sequence = m_sequence;
    }

    /**
     * Returns the raw "unedited" sequence data from the record
     * @return the ORIGIN text exactly as stored, or null if the record had none
     */
    public String get_sequence() {
        return m_sequence;
    }

    /**
     * Returns only residues comprising the sequence. No case conversion is performed.
     * See <code>get_sequence()</code> for the getting at the raw "origin" entry in the genbank record
     *
     * @return null if there is no sequence in the genbank record, residues only otherwise
     */
    public String get_filtered_sequence() {
        if (m_sequence == null) {
            return null;
        }
        final int n = m_sequence.length();
        // the result can never be longer than the raw text, so size the buffer once
        StringBuilder residues = new StringBuilder(n);
        for (int i = 0; i < n; i++) {
            char c = m_sequence.charAt(i);
            if (Character.isLetter(c)) {
                residues.append(c);
            }
        }
        return residues.toString();
    }

    /** Sets the text of the COMMENT section. */
    public void set_comment(String m_comment) {
        this.m_comment = m_comment;
    }

    /** @return the untrimmed COMMENT text, or null if the record had none */
    public String get_comment() {
        return m_comment;
    }

    /** Sets the accession. */
    public void set_accession(String m_accession) {
        this.m_accession = m_accession;
    }

    /** @return the trimmed ACCESSION text, or null if the record had none */
    public String get_accession() {
        return m_accession;
    }

    /** Sets the version. */
    public void set_version(String m_version) {
        this.m_version = m_version;
    }

    /** @return the trimmed VERSION text, or null if the record had none */
    public String get_version() {
        return m_version;
    }
}